{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "melaflefon.ipynb",
      "private_outputs": true,
      "provenance": [],
      "machine_shape": "hm",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/eyaler/avatars4all/blob/master/melaflefon.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "802K_fw2kLbV"
      },
      "source": [
        "# Wav2Lip Demo Colab\n",
        "\n",
        "## Original project: http://bhaasha.iiit.ac.in/lipsync\n",
        "\n",
        "### Original repo: https://github.com/Rudrabha/Wav2Lip\n",
        "\n",
        "#### Made just a little bit more accessible by Eyal Gruss ([@eyaler](https://twitter.com/eyaler) / [eyalgruss.com](https://eyalgruss.com) / [eyalgruss@gmail.com](mailto:eyalgruss@gmail.com))\n",
        "\n",
        "#### Short link here: https://j.mp/wav2lip\n",
        "\n",
        "##### Original notebook: https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8\n",
        "\n",
        "##### Speaker diarization with: https://github.com/tyiannak/pyAudioAnalysis, https://github.com/pyannote/pyannote-audio\n",
        "\n",
        "##### Automatic animated skit creation \"Wav2Skit\" examples: https://twitter.com/eyaler/status/1348769977663451136\n",
        "\n",
        "##### Combined with First Order Motion Model: https://j.mp/vid2head\n",
        "\n",
        "##### Check out more cool avatar notebooks @ avatars4all repository: https://github.com/eyaler/avatars4all\n",
        "\n",
        "##### A curated list of online generative tools (outdated): https://j.mp/generativetools"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "P3LihClHbUd3",
        "cellView": "form"
      },
      "source": [
        "#@title Dub it!\n",
        "#@markdown 1. Choose audio (you can also enter a YouTube or similar URL, or a manually uploaded file name):\n",
        "audio = 'Dangerous time' #@param ['Dangerous time', 'Go home', 'Sound of victory', 'Ernie and Bert (2 speakers)', '11,780 votes (3 speakers)', 'מלפפונים חמוצים', 'שונאת שמאלנים', 'אני שולה', 'אריק ובנץ (2 דוברים)', 'אריק ובנץ וגנץ (3 דוברים)', 'Grab from uploaded video'] {allow-input: true}\n",
        "#@markdown 2. Optionally untick \"smooth_face_detection\" to disable temporal smoothing of face coordinates:\n",
        "smooth_face_detection = True #@param {type: \"boolean\"}\n",
        "#@markdown 3. Optionally tick \"override_face_detection\" to manually assign face coordinates:\n",
        "override_face_detection = False #@param {type: \"boolean\"}\n",
        "left = 0# @param {type: \"integer\"}\n",
        "top = 0# @param {type: \"integer\"}\n",
        "width = 1080 #@param {type: \"integer\"}\n",
        "height = 1920 #@param {type: \"integer\"}\n",
        "#@markdown 4. Optionally tick \"switch_speakers\" to switch between visual media files with the change of speakers:\n",
        "switch_speakers = False #@param {type: \"boolean\"}\n",
        "#@markdown 5. Choose model for speaker diarization:\n",
        "model = 'pyannote-audio DIHARD' #@param ['pyAudioAnalysis', 'pyannote-audio DIHARD','pyannote-audio AMI']\n",
        "#@markdown 6. Optionally tick \"reuse_files\" to reuse previously uploaded files:\n",
        "reuse_files = False #@param {type: \"boolean\"}\n",
        "#@markdown 7. Press the play (triangle) button on the left.\n",
        "#@markdown 8. Press \"Browse\" or \"Choose files\" below, and upload image(s) or video(s) (if not reusing files).\n",
        "#@markdown 9. If the resulting videos are too large, the Colab might disconnect, but you may still manually download the .mp4 from the folder on the left (click \"Refresh\" if missing).\n",
        "\n",
        "from google.colab import files\n",
        "# Reusing uploads requires that an earlier run of this cell left `inputs` in the kernel;\n",
        "# when the name is undefined, fall back to asking for a fresh upload.\n",
        "try:\n",
        "  inputs\n",
        "except NameError:\n",
        "  reuse_files = False\n",
        "\n",
        "if not reuse_files:\n",
        "  # Create the upload folder if needed and empty it by absolute path, so that a failed %cd\n",
        "  # can never make the wildcard delete run in some other working directory.\n",
        "  !mkdir -p /content/sample_data\n",
        "  !rm -rf /content/sample_data/*\n",
        "  %cd /content/sample_data\n",
        "  inputs = files.upload()\n",
        "\n",
        "if inputs:\n",
        "  import os\n",
        "  # Work from /content and fetch the Wav2Lip fork used by the rest of this cell.\n",
        "  %cd /content\n",
        "  !git clone --depth 1 https://github.com/eyaler/Wav2Lip\n",
        "  !pip install librosa==0.9.2\n",
        "  !pip install -U gdown\n",
        "  # Generator weights: Google Drive is tried only when the file is missing; the wget line that\n",
        "  # follows targets the same path (presumably a fallback mirror that -nc skips when the file exists - TODO confirm).\n",
        "  if not os.path.exists('/content/Wav2Lip/checkpoints/wav2lip_gan.pth'):\n",
        "    !gdown https://drive.google.com/uc?id=1dwHujX7RVNCvdR1RR93z0FS2T2yzqup9 -O /content/Wav2Lip/checkpoints/wav2lip_gan.pth\n",
        "  !wget --no-check-certificate -nc https://eyalgruss.com/fomm/wav2lip_gan.pth -O /content/Wav2Lip/checkpoints/wav2lip_gan.pth\n",
        "  #!wget --no-check-certificate -nc https://eyalgruss.com/fomm/wav2lip.pth -O /content/Wav2Lip/checkpoints/wav2lip.pth\n",
        "  # Face detector weights, stored where the repo's sfd detection folder is.\n",
        "  !wget --no-check-certificate -nc https://eyalgruss.com/fomm/s3fd-619a316812.pth -O /content/Wav2Lip/face_detection/detection/sfd/s3fd.pth\n",
        "  !pip install git+https://github.com/ytdl-org/youtube-dl\n",
        "  # grab: take the audio from each uploaded video; manual: audio is a user-named file in /content.\n",
        "  grab = manual = False\n",
        "  # Resolve the `audio` form value to a file under /content:\n",
        "  #   - a URL is downloaded with youtube-dl to /content/custom.mp3\n",
        "  #   - a preset name is mapped to its short file name and fetched if missing\n",
        "  #   - 'Grab from uploaded video' defers to each uploaded file's own audio track\n",
        "  #   - an empty value points at /content/custom.mp3\n",
        "  #   - anything else is treated as the name of a manually uploaded file in /content\n",
        "  if '://' in audio:\n",
        "    # Drop any earlier custom download so the new URL always replaces it.\n",
        "    if os.path.exists('/content/custom.mp3'):\n",
        "      os.remove('/content/custom.mp3')\n",
        "    !youtube-dl --no-playlist --extract-audio --audio-format mp3 \"$audio\" -o \"/content/custom.%(ext)s\"\n",
        "    audio = 'custom'\n",
        "  elif audio=='Dangerous time':\n",
        "    audio = 'dangerous'\n",
        "    if not os.path.exists('/content/dangerous.mp3'):\n",
        "      !youtube-dl --no-playlist --extract-audio --audio-format mp3 https://www.youtube.com/watch?v=cQ54GDm1eL0 -o \"/content/dangerous.%(ext)s\"\n",
        "  elif audio=='Go home':\n",
        "    audio = 'gohome'\n",
        "    !wget --no-check-certificate -nc https://eyalgruss.com/fomm/gohome.mp3\n",
        "  elif audio=='Sound of victory':\n",
        "    audio = 'victory'\n",
        "    if not os.path.exists('/content/victory.mp3'):\n",
        "      !youtube-dl --no-playlist --extract-audio --audio-format mp3 https://www.youtube.com/watch?v=Nu96Fhl1Gjo -o \"/content/victory.%(ext)s\"\n",
        "  elif audio=='Ernie and Bert (2 speakers)':\n",
        "    audio = 'dialog_eng'\n",
        "    if not os.path.exists('/content/dialog_eng.mp3'):\n",
        "      !youtube-dl --no-playlist --extract-audio --audio-format mp3 https://www.youtube.com/watch?v=I78YAciQpr0 -o \"/content/dialog_eng.%(ext)s\"\n",
        "  elif audio == '11,780 votes (3 speakers)':\n",
        "    audio = 'trialog_eng'\n",
        "    if not os.path.exists('/content/trialog_eng.mp3'):\n",
        "      # The output template must use the trialog_eng name that the existence check above\n",
        "      # and the final path below expect.\n",
        "      !youtube-dl --no-playlist --extract-audio --audio-format mp3 https://www.youtube.com/watch?v=o3hrN0cP58Y -o \"/content/trialog_eng.%(ext)s\"\n",
        "  elif audio == 'מלפפונים חמוצים':\n",
        "    audio = 'melaflefon'\n",
        "    !wget --no-check-certificate -nc https://eyalgruss.com/fomm/melaflefon.mp3\n",
        "  elif audio == 'שונאת שמאלנים':\n",
        "    audio = 'sonet'\n",
        "    !wget --no-check-certificate -nc https://eyalgruss.com/fomm/sonet.mp3\n",
        "  elif audio == 'אני שולה':\n",
        "    audio = 'shoula'\n",
        "    !wget --no-check-certificate -nc https://eyalgruss.com/fomm/shoula.mp3\n",
        "  elif audio == 'אריק ובנץ (2 דוברים)':\n",
        "    audio = 'dialog_heb'\n",
        "    if not os.path.exists('/content/dialog_heb.mp3'):\n",
        "      !youtube-dl --no-playlist --extract-audio --audio-format mp3 https://www.youtube.com/watch?v=rrZ3bo4VmpQ -o \"/content/dialog_heb.%(ext)s\"\n",
        "  elif audio == 'אריק ובנץ וגנץ (3 דוברים)':\n",
        "    audio = 'trialog_heb'\n",
        "    if not os.path.exists('/content/trialog_heb.mp3'):\n",
        "      !youtube-dl --no-playlist --extract-audio --audio-format mp3 https://www.youtube.com/watch?v=HOKJnkG5MXQ -o \"/content/trialog_heb.%(ext)s\"\n",
        "  elif audio == 'Grab from uploaded video':\n",
        "    grab = True\n",
        "  elif audio == '':\n",
        "    audio = 'custom'\n",
        "  else:\n",
        "    manual = True\n",
        "  audio = '/content/'+audio\n",
        "  if manual:\n",
        "    # A manual name is given without extension: probe the known extensions in lower and upper case\n",
        "    # and keep the first file that exists. If none matches, the assert below reports the bare path.\n",
        "    for ext in ['mp3','wav','m4a','aac','ogg','flac','wma','aiff','opus','amr','ac3','mp4']:\n",
        "      if os.path.exists(audio+'.'+ext):\n",
        "        audio += '.'+ext\n",
        "        break\n",
        "      if os.path.exists(audio+'.'+ext.upper()):\n",
        "        audio += '.'+ext.upper()\n",
        "        break\n",
        "  else:\n",
        "    # Presets and URL downloads are all stored as .mp3.\n",
        "    audio += '.mp3'\n",
        "  # In grab mode the audio path is replaced per uploaded file later, so it need not exist here.\n",
        "  assert grab or os.path.exists(audio), 'Error: could not find audio file: '+audio\n",
        "\n",
        "  %cd /content/Wav2Lip\n",
        "  outputs = []\n",
        "  # Iterate over a snapshot of the keys because entries of `inputs` are re-keyed inside the loop.\n",
        "  for im in list(inputs):\n",
        "    !rm -rf /content/Wav2Lip/temp/*\n",
        "    infile = '/content/sample_data/'+im\n",
        "    ext = infile.rsplit('.',1)[1]\n",
        "    # Normalize the uploaded name (lower-case extension, no quotes, no spaces) so it is safe\n",
        "    # inside the shell commands below, and keep `inputs` keyed by the new name.\n",
        "    if ext != ext.lower() or \"'\" in infile or ' ' in infile:\n",
        "      ext = ext.lower()\n",
        "      lower = infile.rsplit('.',1)[0].replace(\"'\",'').replace(' ','_')+'.'+ext\n",
        "      !rm -rf \"$lower\"\n",
        "      os.rename(infile, lower)\n",
        "      infile = lower\n",
        "      data = inputs.pop(im)\n",
        "      im = infile.rsplit('/',1)[1]\n",
        "      inputs[im] = data\n",
        "    outfile = '/content/'+im.rsplit('.',1)[0]+'_out.mp4'\n",
        "    !rm -rf \"$outfile\"\n",
        "    # The audio path gets the same sanitization as the input file: a quote or a space in it\n",
        "    # would break the --audio argument of the inference commands below.\n",
        "    if grab:\n",
        "      audio = infile\n",
        "    elif \"'\" in audio or ' ' in audio:\n",
        "      fix = audio.replace(\"'\",'').replace(' ','_')\n",
        "      !rm -rf \"$fix\"\n",
        "      os.rename(audio, fix)\n",
        "      audio = fix\n",
        "    if not override_face_detection:\n",
        "      nosmooth = '' if smooth_face_detection else '--nosmooth'\n",
        "      !python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"$infile\" --audio \\\"\\\"$audio\\\"\\\" --pads 0 20 0 0 $nosmooth --outfile \\\"\\\"$outfile\\\"\\\"\n",
        "    # Second pass with an explicit --box: either requested by the user, or because the first pass\n",
        "    # left temp/faulty_frame.jpg behind (treated here as the sign that no face was detected).\n",
        "    if override_face_detection or os.path.exists('/content/Wav2Lip/temp/faulty_frame.jpg'):\n",
        "      import cv2\n",
        "      if override_face_detection:\n",
        "        print('\\nOverriding face detection')\n",
        "      else:\n",
        "        print('\\nFace not detected - will use whole frame')\n",
        "      # Read one frame only to learn the frame size.\n",
        "      if ext in ['jpg', 'png', 'jpeg']:\n",
        "        frame = cv2.imread(infile)\n",
        "      else:\n",
        "        video_stream = cv2.VideoCapture(infile)\n",
        "        still_reading, frame = video_stream.read()\n",
        "        # Release the capture handle as soon as the first frame has been read.\n",
        "        video_stream.release()\n",
        "      assert frame is not None, 'Error: could not read a frame from: '+infile\n",
        "      y2,x2 = frame.shape[:2]\n",
        "      if override_face_detection:\n",
        "        # Use the box from the form, clipped to the frame on the right and bottom.\n",
        "        x1 = left\n",
        "        y1 = top\n",
        "        x2 = min(left+width, x2)\n",
        "        y2 = min(top+height, y2)\n",
        "      else:\n",
        "        # Whole frame; for landscape frames use the horizontally centered square instead.\n",
        "        x1 = y1 = 0\n",
        "        if x2>y2:\n",
        "          x1 = (x2-y2)//2\n",
        "          x2 = x1+y2\n",
        "      !python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"$infile\" --audio \\\"\\\"$audio\\\"\\\" --box $y1 $y2 $x1 $x2 --pads 0 20 0 0 --outfile \\\"\\\"$outfile\\\"\\\"\n",
        "    outputs.append(outfile)\n",
        "\n",
        "  # `wav` stays None unless speaker switching actually runs; the reporting code further down checks it.\n",
        "  wav = None\n",
        "  if switch_speakers and len(outputs)>1 and not grab:\n",
        "    # Convert the driving audio to a .wav file next to it and hand that file to the diarization model.\n",
        "    wav = audio.rsplit('.',1)[0]+'.wav'\n",
        "    !ffmpeg -i \"$audio\" \"$wav\" -y\n",
        "    # Segments shorter than this (presumably seconds) are merged into a neighbour after diarization.\n",
        "    min_dt = 0.5\n",
        "    if model.startswith('pyannote-audio'):\n",
        "      !pip install pyannote.core==4.1 pyannote.pipeline==1.5.2 pyannote.audio==1.1.1\n",
        "      !wget -nc https://raw.githubusercontent.com/pyannote/pyannote-audio/1.1.1/hubconf.py -P /root/.cache/torch/hub/pyannote_pyannote-audio_main\n",
        "      import torch\n",
        "      import pyannote.core  # See: https://github.com/pyannote/pyannote-audio/issues/561\n",
        "      from pyannote.audio.features.utils import get_audio_duration\n",
        "      # Pick the torch.hub entry point matching the chosen model.\n",
        "      hub_entry = 'dia_ami' if model.endswith('AMI') else 'dia'\n",
        "      pipeline = torch.hub.load('pyannote/pyannote-audio', hub_entry, trust_repo=True)\n",
        "      cls = pipeline({'audio':wav})\n",
        "      tmp_segs = [((seg.start,seg.end),label) for seg,_,label in cls.itertracks(yield_label=True)]\n",
        "      # A sentinel entry starting at the audio duration closes the last speaker turn.\n",
        "      tmp_segs = tmp_segs+[((get_audio_duration({'audio':wav}),None),None)]\n",
        "      segs = []\n",
        "      prev_ind = prev_start = None\n",
        "      # Collapse consecutive tracks of one label into a single turn that lasts until the\n",
        "      # first track with a different label starts.\n",
        "      for (start,end),ind in tmp_segs:\n",
        "        if ind==prev_ind:\n",
        "          continue\n",
        "        if prev_ind is not None:\n",
        "          segs.append([(prev_start,start),prev_ind])\n",
        "        prev_ind,prev_start = ind,start\n",
        "    elif model=='pyAudioAnalysis':\n",
        "      !pip install hmmlearn==0.2.8\n",
        "      !pip install eyeD3==0.9.5\n",
        "      !pip install pydub==0.24.0\n",
        "      !pip install pyAudioAnalysis==0.3.14\n",
        "      from pyAudioAnalysis import audioSegmentation as aS\n",
        "      mid_window = 2\n",
        "      mid_step = 0.2\n",
        "      short_window = 0.05\n",
        "      lda_dim = 0 #35\n",
        "      # The number of speakers passed in is the number of generated videos.\n",
        "      cls = aS.speaker_diarization(wav, len(outputs), mid_window=mid_window, mid_step=mid_step, short_window=short_window, lda_dim=lda_dim)[0]\n",
        "      segs = list(zip(*aS.labels_to_segments(cls, mid_step)))\n",
        "    # Walk the segments from last to second and absorb every one shorter than min_dt into\n",
        "    # its predecessor; counters are kept for the summary line printed later.\n",
        "    deleted = unified = 0\n",
        "    if min_dt:\n",
        "      for i in reversed(range(1,len(segs))):\n",
        "        seg_start,seg_end = segs[i][0][0],segs[i][0][1]\n",
        "        if not (seg_end-seg_start<min_dt):\n",
        "          continue\n",
        "        prev_start,prev_label = segs[i-1][0][0],segs[i-1][1]\n",
        "        if i+1<len(segs) and prev_label == segs[i+1][1]:\n",
        "          # Same speaker before and after the short segment: fuse all three into one.\n",
        "          segs[i-1] = ((prev_start,segs[i+1][0][1]),prev_label)\n",
        "          del segs[i+1]\n",
        "          unified += 1\n",
        "        else:\n",
        "          # Otherwise just extend the previous segment over the short one.\n",
        "          segs[i-1] = ((prev_start,seg_end),prev_label)\n",
        "        del segs[i]\n",
        "        deleted += 1\n",
        "    # Map each speaker label, in order of first appearance, to one of the generated videos\n",
        "    # (wrapping around when there are more labels than videos) and build the ffmpeg concat list.\n",
        "    inds = {}\n",
        "    my_ind = 0\n",
        "    last_seg = len(segs)-1\n",
        "    entries = []\n",
        "    for i,((start,end),ind) in enumerate(segs):\n",
        "      if ind not in inds:\n",
        "        inds[ind] = my_ind%len(outputs)\n",
        "        my_ind += 1\n",
        "      entries.append(\"file '%s'\\n\"%outputs[inds[ind]])\n",
        "      # The first entry has no inpoint and the last one has no outpoint.\n",
        "      if i>0:\n",
        "        entries.append('inpoint %f\\n'%start)\n",
        "      if i<last_seg:\n",
        "        entries.append('outpoint %f\\n'%end)\n",
        "    with open('/content/list.txt','w',encoding='utf8') as f:\n",
        "      f.writelines(entries)\n",
        "    # Video comes from the concat list, audio from the first generated video.\n",
        "    !ffmpeg -f concat -safe 0 -i /content/list.txt -i \"{outputs[0]}\" -map 0:v -map 1:a -c:v libx264 -c:a aac -vf \"crop=trunc(iw/2)*2:trunc(ih/2)*2\" -pix_fmt yuv420p -profile:v baseline -movflags +faststart /content/combined.mp4 -y\n",
        "    new_outputs = ['/content/combined.mp4']\n",
        "    if len(outputs)==2:\n",
        "      # With exactly two videos also render the opposite assignment of speakers to videos.\n",
        "      swapped = []\n",
        "      for i,((start,end),ind) in enumerate(segs):\n",
        "        swapped.append(\"file '%s'\\n\"%outputs[1-inds[ind]])\n",
        "        if i>0:\n",
        "          swapped.append('inpoint %f\\n'%start)\n",
        "        if i<last_seg:\n",
        "          swapped.append('outpoint %f\\n'%end)\n",
        "      with open('/content/list2.txt','w',encoding='utf8') as f:\n",
        "        f.writelines(swapped)\n",
        "      !ffmpeg -f concat -safe 0 -i /content/list2.txt -i \"{outputs[1]}\" -map 0:v -map 1:a -c:v libx264 -c:a aac -vf \"crop=trunc(iw/2)*2:trunc(ih/2)*2\" -pix_fmt yuv420p -profile:v baseline -movflags +faststart /content/combined2.mp4 -y\n",
        "      new_outputs.append('/content/combined2.mp4')\n",
        "    # From here on only the combined videos are shown and downloaded.\n",
        "    outputs = new_outputs\n",
        "\n",
        "  from IPython.display import HTML, clear_output\n",
        "  from base64 import b64encode\n",
        "\n",
        "  clear_output()\n",
        "  print('if video does not show below, you can still download it!')\n",
        "  if wav:\n",
        "    print('speakers=%d segments=%d deleted=%d unified=%d'%(len(inds), len(segs),deleted,unified))\n",
        "  muted = 'muted'\n",
        "\n",
        "  # Show the videos in reverse order; only the last one shown (the first output) plays with sound.\n",
        "  for i,file in enumerate(reversed(outputs)):\n",
        "    if i==len(outputs)-1:\n",
        "      muted = ''\n",
        "    # Embedding is best effort: the whole file is inlined as a base64 data URL. A failure is\n",
        "    # reported and skipped so the remaining videos and the downloads still go ahead.\n",
        "    try:\n",
        "      with open(file, 'rb') as f:\n",
        "        data_url = \"data:video/mp4;base64,\" + b64encode(f.read()).decode()\n",
        "      display(HTML(\"\"\"\n",
        "      <video width=600 controls autoplay loop %s>\n",
        "            <source src=\"%s\" type=\"video/mp4\">\n",
        "      </video>\"\"\" % (muted,data_url)))\n",
        "    except Exception as e:\n",
        "      print('Warning: could not display %s (%s)'%(file, e))\n",
        "  if wav:\n",
        "    print('speakers=%d segments=%d deleted=%d unified=%d'%(len(inds), len(segs),deleted,unified))\n",
        "  # Downloads are best effort too: report a failure and carry on with the next file.\n",
        "  for file in outputs:\n",
        "    try:\n",
        "      files.download(file)\n",
        "    except Exception as e:\n",
        "      print('Warning: could not start the download of %s (%s)'%(file, e))"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}